# -*- coding=utf-8 -*-

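# Single-page data source: https://db.yaozh.com/hmap?grade=全部&p=1&pageSize=30&province=辽宁省&type=全部
# The total number of records can be read from the first page.
# A single page holds at most 30 records.
# Detail links are completed as site root + row href,
# e.g. "https://db.yaozh.com/" + "/hmap/14.html".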

import os
import os.path
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from demo_analysis_website.hospitalObj import hospitalObj

# Hospital listing page that the script fetches and scrapes.
base_url = "https://db.yaozh.com/hmap"
# Site root that each row's href is combined with to form a detail link.
root_url="https://db.yaozh.com/"

def request_web_source_body(url):
    """Download *url* and extract the hospital table container(s) from it.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list with the inner HTML of every
        ``<div class="offset-top table-list" data-max="300">`` element found
        in the page; empty when the page contains no such element.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Raw string: "\s" and "\S" are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    regex_rule = r'<div class="offset-top table-list" data-max="300">([\s\S]*?)</div>'
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        html_body_source = response.read().decode("utf-8")

    return get_regex_msg(html_body_source, regex_rule)

def get_page_obj(content):
    """Build one ``hospitalObj`` per data row found in an HTML fragment.

    Args:
        content: HTML markup containing the ``<tr>`` rows of the hospital
            table.

    Returns:
        A list of ``hospitalObj`` instances in document order. Rows without
        ``<td>`` cells (header rows) and rows missing the name ``<th>``, the
        detail link, or any of the expected cells are skipped.
    """
    # Cells read from each row, in order:
    # grade, type, province, city, town, beds, address.
    expected_cells = 7
    hospitallist = []
    soup = BeautifulSoup(content, 'html.parser')
    for tr in soup.find_all(name="tr"):
        # Query the row tag directly; serialising it and parsing it again
        # yields the same elements at extra cost.
        tds = tr.find_all(name="td")
        name_cell = tr.find(name="th")
        link = tr.find(name="a", href=True)
        # Guard against header rows and malformed rows (for example a
        # placeholder row) instead of failing with IndexError/KeyError.
        if len(tds) < expected_cells or name_cell is None or link is None:
            continue

        hsobj = hospitalObj()
        # urljoin avoids the doubled "/" that plain concatenation produces
        # when the href is root-relative (presumably "/hmap/<id>.html").
        hsobj.setHref(urllib.parse.urljoin(root_url, link['href']))
        hsobj.setName(name_cell.getText())
        hsobj.setGrade(tds[0].getText())
        hsobj.setType(tds[1].getText())
        hsobj.setProvince(tds[2].getText())
        hsobj.setCity(tds[3].getText())
        hsobj.setTown(tds[4].getText())
        hsobj.setBedstr(tds[5].getText())
        hsobj.setAddress(tds[6].getText())
        hospitallist.append(hsobj)

    return hospitallist



def get_regex_msg(str, regex_pattern):
    """Return all non-overlapping matches of *regex_pattern* in *str*.

    Args:
        str: Text to search. The name shadows the builtin; it is kept so
            keyword callers keep working.
        regex_pattern: Regular expression source.

    Returns:
        A list of matches as produced by ``findall``: whole matches when the
        pattern has no group, the group's text when it has one group, and
        tuples when it has several.
    """
    return re.findall(regex_pattern, str)


if __name__ == '__main__':
    # Fetch the first listing page for one province and print every hospital.
    province_name = "四川省"
    # urlencode percent-encodes the non-ASCII values; urlopen cannot send
    # raw non-ASCII characters in a request URL.
    require_param = "?" + urllib.parse.urlencode({
        "grade": "全部",
        "p": 1,
        "pageSize": 30,
        "province": province_name,
        "type": "全部",
    })
    # Send the query along with the request; previously it was built but
    # never used, so the province filter had no effect.
    table_fragments = request_web_source_body(base_url + require_param)
    if not table_fragments:
        # Exit with a readable message instead of an IndexError on [0].
        raise SystemExit("hospital table not found in the response")

    for hospital in get_page_obj(table_fragments[0]):
        print(hospital)

